# Prerequisites: download the DistilCodec config and checkpoint into the working directory:
# wget https://huggingface.co/IDEA-Emdoor/DistilCodec-v1.0/resolve/main/model_config.json
# wget https://huggingface.co/IDEA-Emdoor/DistilCodec-v1.0/resolve/main/g_00204000

from distilcodec import DistilCodec, demo_for_generate_audio_codes
from transformers import AutoTokenizer, AutoModelForCausalLM

# Local paths to the DistilCodec config and checkpoint files
# (the file names match the downloads listed at the top of this script).
codec_model_config_path = 'model_config.json'
codec_ckpt_path = 'g_00204000'

# Build the codec from the local config/checkpoint, then call .eval() on it
# before it is used for decoding.
codec = DistilCodec.from_pretrained(
    config_path=codec_model_config_path,
    model_path=codec_ckpt_path,
    use_generator=True,
    is_debug=False,
)
codec = codec.eval()

# Hugging Face repo id shared by the tokenizer and the causal LM.
tts_model_id = 'mesolitica/Malaysian-TTS-1.7B-v1'

tokenizer = AutoTokenizer.from_pretrained(tts_model_id)
# torch_dtype='auto' defers the dtype choice to transformers.
model = AutoModelForCausalLM.from_pretrained(tts_model_id, torch_dtype='auto')
import soundfile as sf
import re
from tqdm import tqdm

# Speaker names to synthesize; each one is prepended to the text as
# "<speaker>: <text>" when the prompt is built in the loop below.
speakers = [
    'husein', 'idayu', 'singaporean', 'DisfluencySpeech',
    'singlish-speaker2050', 'singlish-speaker2202', 'haqkiem',
]

# Sentence spoken by every speaker (mixed Malay / English). The adjacent
# literals are concatenated into a single string.
string = (
    'IC saya adalah, sembilan enam, kosong tiga, satu empat, '
    'one, one, one, one, A, B, C, D, D, yes, '
    'Husein is very cute, cute, cute.'
)

# Extracts the numeric id from each generated speech token ("speech_<n>").
# Compiled once here instead of being looked up on every iteration.
speech_code_pattern = re.compile(r'speech_(\d+)')

# Sample rate passed to soundfile when writing the audio.
# NOTE(review): presumably the codec's native output rate -- confirm against DistilCodec.
sample_rate = 24000

# Synthesize the same sentence once per speaker and write one audio file each.
for s in tqdm(speakers):

    # Prompt layout: "<speaker>: <text>" between the chat-start and
    # speech-start markers.
    left = s + ': ' + string
    prompt = f'<|im_start|>{left}<|speech_start|>'

    generate_kwargs = dict(
        # The prompt already contains its own special markers, so the
        # tokenizer is told not to add any more.
        **tokenizer(prompt, return_tensors='pt', add_special_tokens=False),
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
        repetition_penalty=1.1,
    )
    generation_output = model.generate(**generate_kwargs)

    # Decode the first returned sequence and keep only the text after the last
    # '<|speech_start|>' marker, with any '<|endoftext|>' removed.
    decoded = tokenizer.decode(generation_output[0])
    speech_token = decoded.split('<|speech_start|>')[-1].replace('<|endoftext|>', '')

    # Integer ids parsed out of the generated "speech_<n>" tokens.
    d = [int(code) for code in speech_code_pattern.findall(speech_token)]
    if not d:
        # Nothing to decode: skip this speaker instead of handing an empty
        # list to the codec, so the remaining speakers are still processed.
        tqdm.write(f'{s}: no speech tokens generated, skipping')
        continue

    # NOTE(review): minus_token_offset=False presumably means the ids are
    # already raw codebook indices -- confirm against the DistilCodec docs.
    y_gen = codec.decode_from_codes(
        d,
        minus_token_offset=False
    )

    # y_gen[0, 0] selects the first item along the first two axes.
    # NOTE(review): assumed layout is (batch, channel, samples) -- confirm.
    # NOTE: soundfile infers the container from the '.mp3' extension; MP3
    # output requires a libsndfile build with MP3 support.
    sf.write(f'{s}.mp3', y_gen[0, 0].cpu().numpy(), sample_rate)
